*********************************************
*	Author: Rithika Kumar	                *
*   GOAL: Setting up data for Appendix F2   *        
*   Table A13                               * 
*********************************************

********************************************************************************************************
* NOTE: Compiling data from the IHDS individual and household datasets for the salary analysis.
* This data is not in the EW dataset and needs to be compiled by merging multiple datasets
* from the IHDS.
* I use the EW file main_ew_hh_df.dta as the base and match it on household and individual
* IDs in the other files.
********************************************************************************************************


* Set working directory to "JOP Replication files" folder on your computer 

* ---------------------------------------------------------------------------
* Part 1: person-level work/wage records from the IHDS file, then a 1:1 match
*         onto the eligible-women (EW) file by person-wave key.
*
* Both sides of the merge build the key `puid' the same way. The ID components
* are plain integers of varying width, so they are joined with "_" here:
* an undelimited concat() cannot tell DISTID 1 + PSUID 12 from DISTID 11 +
* PSUID 2, and the duplicate filters below would then silently throw away every
* record involved in such a collision.
* NOTE: the observation counts the original author recorded for these steps
* (149373 persons; 295533 / 295,528 person-waves; 47340 EW rows) were obtained
* with the undelimited key and may differ with the delimited one.
* ---------------------------------------------------------------------------

use "DATA FILES TO SHARE/IHDS_RAW/37382-0005-Data.dta", clear

** Keep only the identifiers, wave markers, work/wage items and weights used below
keep SURVEY HHBASE PBASE HHFAM2 HHFAM2012 STATEID DISTID PSUID HHID HHSPLITID PERSONID IDPERSON PID2005 PID2012 HHID2005 HHID2012 HHSPLITID2005 HHSPLITID2012 HHWAVES PWAVES STDIST01 PSUWAVES URBAN URBAN2001 URBAN4 URBAN4_2001 URBAN2011 URBAN4_2011 METRO METRO6 RO0 RO1 RO1ID1993 RO3 RO4 RO5 RO6 RO7 RO8 RO9 RO10 FM1 FM36Y WS2Y WKNONAG WKAGLAB NF1 WKDAYS WKHOURS WKANY FMDAYS WSDAYS WS5 WT2005 WT FWT HHPBASE

** Survey year from the wave indicator (stays 0 for any other SURVEY value)
gen year_round = 0
replace year_round = 2005 if SURVEY == 1
replace year_round = 2012 if SURVEY == 2

** Person key built from the 2005 identifiers only (no wave), so the same
** person carries the same key in both rounds
egen puid_noyr = concat(STATEID DISTID PSUID HHID2005 HHSPLITID2005 PID2005), punct(_)

** Check: number of distinct persons (the max of x)
egen x = group(puid_noyr)
sum x

** Person-wave key: the person key plus the survey year
egen puid = concat(STATEID DISTID PSUID HHID2005 HHSPLITID2005 PID2005 year_round), punct(_)

** Flag person-wave keys that occur more than once: dup is 0 for a unique key
** and 1, 2, ... within a repeated key, so `dup > 0' removes EVERY row of a
** repeated key, not just the extra copies
sort puid
quietly by puid: gen dup = cond(_N==1,0,_n)
tab dup
drop if dup > 0

** After that filter a person key can occur at most once per wave. dup_2 == 0
** marks persons left with a single wave; drop them so only persons observed
** in both rounds remain
sort puid_noyr
quietly by puid_noyr: gen dup_2 = cond(_N==1,0,_n)
tab dup_2
drop if dup_2 == 0

tempfile wage_lab_merge
save `wage_lab_merge'

clear

*** Now match these person-wave records to the eligible women data
use "DATA FILES TO SHARE/TEMP_FILES/main_ew_hh_df.dta"

** In the EW file the wave is coded in `year' (0 / 1)
gen year_round = 0
replace year_round = 2005 if year == 0
replace year_round = 2012 if year == 1

** Rebuild the person-wave key with exactly the same components, order and
** delimiter as above. The drops of puid here, and of _merge and z further
** down, assume main_ew_hh_df.dta already carries those variables (drop
** fails otherwise).
drop puid
egen puid = concat(STATEID DISTID PSUID HHID2005 HHSPLITID2005 PERSONID2005 year_round), punct(_)

** Same uniqueness filter as on the IHDS side: remove every row of a repeated key
sort puid
quietly by puid: gen dup = cond(_N==1,0,_n)
tab dup
drop if dup > 0

** Diagnostic only: how many EW persons appear once vs. twice (nothing is dropped here)
egen puid_noyr = concat(STATEID DISTID PSUID HHID2005 HHSPLITID2005 PERSONID2005), punct(_)

sort puid_noyr
quietly by puid_noyr: gen dup_2 = cond(_N==1,0,_n)
tab dup_2

*** Merge onto the wage data; puid is unique on both sides after the filters above
drop _merge

merge 1:1 puid using `wage_lab_merge'

** Keep only person-waves found in both files
keep if _merge==3

** Check: number of distinct households left (the max of z)
drop z
egen z = group(hhuid_num)
sum z


*** 1. Get the mean of farm labor, non-agri labor, non-farm business

** Indicator for any agricultural wage labor.
** Stata orders missing values above every number, so a bare `WKAGLAB > 0'
** is also true when WKAGLAB is missing; the explicit missing() guard keeps
** those observations at 0 instead of coding them as farm laborers.
gen farm_lab = 0
replace farm_lab = 1 if WKAGLAB > 0 & !missing(WKAGLAB)

** Set days worked on the own farm to 0 for those who did not work on their own farm
replace FMDAYS = 0 if FM36Y == 0
mean FMDAYS farm_lab WKNONAG  NF1

** Negative values in the EW covariates are recoded to missing
replace EW_health = . if EW_health<0
replace EW_child = . if EW_child<0
replace EW_edu = . if EW_edu<0
replace EW_Age = . if EW_Age<0

tab w2_abshusband_dummy
/*
Tabulation recorded by the original author:

w2_abshusba |
   nd_dummy |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |     46,144       95.29       95.29
          1 |      2,280        4.71      100.00
------------+-----------------------------------
      Total |     48,424      100.00
*/

** Full merged file, kept in the temp folder
save "DATA FILES TO SHARE/TEMP_FILES/wage_lab_analysis.dta", replace

** Reduced file with only the listed variables
keep year w2_abshusband_dummy FMDAYS farm_lab WKNONAG NF1 hhuid_num EW_Age EW_health dil_dummy INCOME anotherst_dummy URBAN did_sample IDPSU FM1

save "DATA FILES TO SHARE/ihds_wages.dta", replace





***** Salary Work Analysis ******


use "DATA FILES TO SHARE/TEMP_FILES/main_ew_hh_df.dta", clear

** Person key from the 2005 identifiers: each component is written with the
** %02.0f format (zero-padded to at least two digits) and the pieces are
** concatenated in a fixed order
format %02.0f PERSONID2005
gen str12 uid_person_using2005 = string(STATEID, "%02.0f")       ///
                               + string(DISTID, "%02.0f")        ///
                               + string(PSUID, "%02.0f")         ///
                               + string(HHID2005, "%02.0f")      ///
                               + string(HHSPLITID2005, "%02.0f") ///
                               + string(PERSONID2005, "%02.0f")

** Check: number of distinct person keys (the max of check)
egen check = group(uid_person_using2005)
summarize check

** Lookup table: wave-0 rows only, person key plus the w2_abshusband_dummy flag
drop if year != 0
keep uid_person_using2005 w2_abshusband_dummy
tempfile did
save `did'


** Import the pooled DATA for the INDIVIDUAL QUESTIONNAIRE 
** Load the pooled data for the individual questionnaire
use "DATA FILES TO SHARE/IHDS_RAW/37382-0001-Data.dta", clear

** Survey year from the wave indicator (0 for any other SURVEY value)
gen year_round = cond(SURVEY == 1, 2005, cond(SURVEY == 2, 2012, 0))
tab year_round

** Display formats with leading zeros for the ID components. These only change
** how the variables are shown; the string() calls below pass their own format.
format %02.0f PERSONID PID2005 DISTID PSUID HHSPLITID2005
format %03.0f HHID2005

** Keep only individuals with PWAVES == 11 (the code for presence in both waves)
tab PWAVES, nolabel
drop if PWAVES != 11
tab year_round

** Household key from state, district, PSU and the 2005 household / split IDs
gen str10 hhuid_using2005 = string(STATEID, "%02.0f")       ///
                          + string(DISTID, "%02.0f")        ///
                          + string(PSUID, "%02.0f")         ///
                          + string(HHID2005, "%02.0f")      ///
                          + string(HHSPLITID2005, "%02.0f")

** Check: distinct households (author recorded 34,643 with both-wave members)
egen x = group(hhuid_using2005)
summarize x
drop x

** Person key: the same household components plus the 2005 person ID
gen str12 uid_person_using2005 = string(STATEID, "%02.0f")       ///
                               + string(DISTID, "%02.0f")        ///
                               + string(PSUID, "%02.0f")         ///
                               + string(HHID2005, "%02.0f")      ///
                               + string(HHSPLITID2005, "%02.0f") ///
                               + string(PID2005, "%02.0f")

** Check: distinct individuals (author recorded 150988, all present in both waves)
egen x = group(uid_person_using2005)
summarize x

** Attach the eligible-women lookup (one row per person key in `did') and
** keep only individuals found there
merge m:1 uid_person_using2005 using `did'
keep if _merge == 3

tab SURVEY

/*
Tabulation recorded by the original author:

      IHDS1 |
  (2005) or |
      IHDS2 |
     (2012) |      Freq.     Percent        Cum.
------------+-----------------------------------
    IHDS1 1 |     24,866       50.00       50.00
    IHDS2 2 |     24,865       50.00      100.00
------------+-----------------------------------
      Total |     49,731      100.00

*/

save "DATA FILES TO SHARE/TEMP_FILES/inc_earned_EW_analysis", replace
 
** Wave dummy: 1 for the 2012 round, 0 otherwise
gen year = (year_round == 2012)

** Numeric copy of the person key, used as the grouping variable
destring uid_person_using2005, generate(uid_person_num)

** Per person, sum WS2Y over that person's rows (egen total() treats missing as 0)
bysort uid_person_num: egen ws_both_wave = total(WS2Y)
tab ws_both_wave

** For the analysis we only want those who worked in both waves;
** cross-tabulate and summarize the total by the w2_abshusband_dummy flag
tab w2_abshusband_dummy ws_both_wave
bysort w2_abshusband_dummy: summarize ws_both_wave

** Final analysis file with only the listed variables
keep SALARYEARN w2_abshusband_dummy year URBAN ws_both_wave IDPSU DISTID hhuid_using2005

save "DATA FILES TO SHARE/inc_earned_EW_analysis.dta", replace




